{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fbf7092a",
   "metadata": {},
   "source": [
    "Analysis of TikTok sets labeled just for pranks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3daad8f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "tiktok1 = pd.read_csv('tesstanTikTokset1merged_withrevisedlabels.csv')\n",
    "tiktok2 = pd.read_csv('tesstanTikTokset2merged_withrevisedlabels.csv')\n",
    "\n",
    "print(\"First few rows of tiktok1:\")\n",
    "print(tiktok1.head())\n",
    "\n",
    "print(\"\\nFirst few rows of tiktok2:\")\n",
    "print(tiktok2.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f861363",
   "metadata": {},
   "outputs": [],
   "source": [
    "tiktok1.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66253328",
   "metadata": {},
   "outputs": [],
   "source": [
    "tiktok2.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6233099c",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_of_interest = [\"isPrankTan\", \"isPrankTess\", \"Tanrevised\", \"Tessrevised\"]\n",
    "\n",
    "def print_unique_entries(df, df_name):\n",
    "    print(f\"Unique entries in {df_name}:\")\n",
    "    for column in columns_of_interest:\n",
    "        unique_values = df[column].unique()\n",
    "        print(f\"\\n{column}:\")\n",
    "        print(unique_values)\n",
    "\n",
    "print_unique_entries(tiktok1, \"tiktok1\")\n",
    "\n",
    "print_unique_entries(tiktok2, \"tiktok2\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e949f13",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "nan_count = tiktok1['isPrankTan'].isna().sum()\n",
    "\n",
    "tiktok1 = tiktok1.dropna(subset=['isPrankTan'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab40ed0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def standardize_format(df, columns):\n",
    "    for column in columns:\n",
    "        df[column] = df[column].astype(str)  \n",
    "        df[column] = df[column].str.strip() \n",
    "        df[column] = df[column].replace({'0': '0', '1': '1', 'x': 'x'})  \n",
    "    return df\n",
    "\n",
    "tiktok1 = standardize_format(tiktok1, columns_of_interest)\n",
    "\n",
    "tiktok2 = standardize_format(tiktok2, columns_of_interest)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37e56846",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def remove_x_rows(df, column_name, df_name):\n",
    "    x_count = (df[column_name] == 'x').sum()\n",
    "    df = df[df[column_name] != 'x']\n",
    "    print(f\"Number of rows removed from {df_name} due to 'x' in '{column_name}': {x_count}\")\n",
    "    return df\n",
    "\n",
    "tiktok1 = remove_x_rows(tiktok1, 'Tessrevised', 'tiktok1')\n",
    "\n",
    "tiktok2 = remove_x_rows(tiktok2, 'Tessrevised', 'tiktok2')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4428224",
   "metadata": {},
   "outputs": [],
   "source": [
    "#summary stats\n",
    "tess_non_prank_count_2 = tiktok2['isPrankTess'].value_counts().get('0', 0)\n",
    "tess_prank_count_2 = tiktok2['isPrankTess'].value_counts().get('1', 0)\n",
    "\n",
    "tan_non_prank_count_2 = tiktok2['isPrankTan'].value_counts().get('0', 0)\n",
    "tan_prank_count_2 = tiktok2['isPrankTan'].value_counts().get('1', 0)\n",
    "\n",
    "print(f\"Tess labeled as non-prank (0) in tiktok2: {tess_non_prank_count_2}\")\n",
    "print(f\"Tess labeled as prank (1) in tiktok2: {tess_prank_count_2}\\n\")\n",
    "\n",
    "print(f\"Tan labeled as non-prank (0) in tiktok2: {tan_non_prank_count_2}\")\n",
    "print(f\"Tan labeled as prank (1) in tiktok2: {tan_prank_count_2}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c9d9405",
   "metadata": {},
   "outputs": [],
   "source": [
    "#percent agree and cohens\n",
    "agreement_2 = tiktok2[tiktok2['isPrankTess'] == tiktok2['isPrankTan']]\n",
    "percent_agreement_2 = len(agreement_2) / len(tiktok2) * 100\n",
    "\n",
    "kappa_2 = cohen_kappa_score(tiktok2['isPrankTess'], tiktok2['isPrankTan'])\n",
    "\n",
    "print(f\"Updated percent agreement between Tess and Tan in tiktok2: {percent_agreement_2:.2f}%\")\n",
    "print(f\"Updated Cohen's kappa between Tess and Tan in tiktok2: {kappa_2:.2f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3eeccfb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge tiktok1 and tiktok2\n",
    "merged = pd.concat([tiktok1, tiktok2], ignore_index=True)\n",
    "\n",
    "print(\"First few rows of merged:\")\n",
    "print(merged.head())\n",
    "print(f\"\\nShape of merged: {merged.shape}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52074788",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count agree label stats\n",
    "count_ones = merged['Tessrevised'].value_counts().get('1', 0)\n",
    "count_zeros = merged['Tessrevised'].value_counts().get('0', 0)\n",
    "\n",
    "total_rows = len(merged)\n",
    "percentage_ones = (count_ones / total_rows) * 100\n",
    "\n",
    "print(f\"Number of 1s in 'Tessrevised': {count_ones}\")\n",
    "print(f\"Number of 0s in 'Tessrevised': {count_zeros}\")\n",
    "print(f\"Percentage of 1s in 'Tessrevised' from total number of rows: {percentage_ones:.2f}%\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c2c9d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#stats per year\n",
    "percent_1s_per_year = merged.groupby('Year')['Tessrevised'].apply(lambda x: (x == '1').mean() * 100)\n",
    "\n",
    "print(\"Percentage of rows labeled 1 in 'Tessrevised' per year:\")\n",
    "print(percent_1s_per_year)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c8920a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "years = ['2020', '2021', '2022', '2023']\n",
    "percentages = [5.178571, 5.596107, 5.471125, 4.213483]\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(years, percentages, marker='o', linestyle='-', color='b')\n",
    "\n",
    "plt.title(\"Percentage of Pranks per Year, Top TikTok Creators\")\n",
    "plt.xlabel(\"Year\")\n",
    "plt.ylabel(\"Percentage of Pranks\")\n",
    "plt.ylim(0, max(percentages) + 2)\n",
    "plt.grid(True)\n",
    "\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
